import pandas as pd
import json
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import sys
import importlib
sys.path.insert(0, '../')
import general_utils as gen_ut
sys.path.insert(0, '../3_Hashtag_study')
import hashtag_util as ut_ht
sys.path.insert(0, '../4_URL_study')
import url_util as ut_url
with open('../1_Dataset_preparation/listControlledUsers.json','r') as file_object:
data = json.load(file_object)
listSuspect = data['Novax']
def df_preparation(cols,filename='../tweets.csv'):
columns = list(set(cols + ['id','user_screen_name','rt_user_screen_name','in_reply_to_screen_name']))
df = pd.read_csv(filename,low_memory=False, usecols=columns)
dfSuspect = pd.DataFrame()
for sus in listSuspect:
df1 = df.loc[df['user_screen_name'] == sus,:]
dfSuspect = pd.concat([df1,dfSuspect],ignore_index=True)
df1 = df.loc[df['rt_user_screen_name'] == sus,:]
dfSuspect = pd.concat([df1,dfSuspect],ignore_index=True)
df1 = df.loc[df['in_reply_to_screen_name'] == sus,:]
dfSuspect = pd.concat([df1,dfSuspect],ignore_index=True)
del df1
dfSuspect.drop_duplicates(subset=['id'])
return dfSuspect
df = df_preparation(['urls','created_at'])
df['created_at'] = pd.to_datetime(df['created_at'], format="%a %b %d %X %z %Y")
df
| id | created_at | user_screen_name | in_reply_to_screen_name | rt_user_screen_name | urls | |
|---|---|---|---|---|---|---|
| 0 | 1315035428098015237 | 2020-10-10 21:04:12+00:00 | KurtRoll3 | borghi_claudio | NaN | [] |
| 1 | 1314915759932534788 | 2020-10-10 13:08:41+00:00 | Giovanni1946861 | borghi_claudio | NaN | [] |
| 2 | 1314824033154871296 | 2020-10-10 07:04:12+00:00 | Franco_Barile1 | borghi_claudio | NaN | [] |
| 3 | 1315421329072545793 | 2020-10-11 22:37:38+00:00 | AnnamariaCorra7 | borghi_claudio | NaN | [] |
| 4 | 1315366608202805248 | 2020-10-11 19:00:12+00:00 | gaetanovetri1 | borghi_claudio | NaN | [] |
| ... | ... | ... | ... | ... | ... | ... |
| 442502 | 1394545421578604548 | 2021-05-18 06:48:32+00:00 | IacobellisT | NaN | NaN | [] |
| 442503 | 1394545570417725446 | 2021-05-18 06:49:08+00:00 | IacobellisT | NaN | rubino7004 | [] |
| 442504 | 1394545509868707841 | 2021-05-18 06:48:53+00:00 | IacobellisT | NaN | GiuseppePalma78 | [] |
| 442505 | 1394545510346895363 | 2021-05-18 06:48:53+00:00 | IacobellisT | NaN | byoblu | [] |
| 442506 | 1394545697526067201 | 2021-05-18 06:49:38+00:00 | IacobellisT | NaN | valy_s | [] |
442507 rows × 6 columns
dfUrls = ut_url.df_url_counter(df)
dfUrls = ut_url.url_credibility('../1_Dataset_preparation/data/credibility.csv',dfUrls)
dfUrls
| count | Class | |
|---|---|---|
| url | ||
| twitter.com | 15416 | none |
| youtube.com | 5114 | none |
| imolaoggi.it | 3436 | low |
| iltempo.it | 1336 | high |
| ansa.it | 1297 | high |
| ... | ... | ... |
| dailyhealthindustry.it | 1 | none |
| dailymirror.lk | 1 | none |
| dailypostusa.com | 1 | none |
| dailystar.co.uk | 1 | low |
| …xevxq4s5y-1scandal-com.translate.goog | 1 | none |
1502 rows × 2 columns
sum(dfUrls['count'])
76991
n = 20
fig = px.histogram(dfUrls.head(n),y=dfUrls.head(n).index,x='count',
title="The most %d url used in the tweets"% n,orientation = 'h',color='Class')
fig.update_yaxes(title='URL name')
fig.update_yaxes(categoryorder='total descending')
fig.show()
#Creation of a dictionary of num of use per date
my_dict = {"url":[],"date":[], "count":[]};
for i in range(len(df)):
s = df.loc[i,'urls']
d = df.loc[i,'created_at']
url = gen_ut.get_string_json(s,'display_url')
if url:
url = url[0].split("//")
url = url[0].split("/")
my_dict["url"].append(url[0])
my_dict["date"].append(d)
my_dict["count"].append(1)
dfUseUrl = pd.DataFrame.from_dict(my_dict)
dfUseUrl['Week/Year'] = dfUseUrl['date'].apply(lambda x: "%d-%d" % (x.isocalendar()[1] , x.isocalendar()[0]))
dfUseUrl.drop(['date'], axis=1,inplace=True)
dfUseUrl = dfUseUrl.groupby(['Week/Year', 'url']).sum()
dfUseUrl.reset_index(inplace=True)
dfUseUrl['Week/Year'] =pd.to_datetime(dfUseUrl['Week/Year']+ '-1', format="%W-%Y-%w")
dfUseUrl.sort_values(['Week/Year'],axis = 0,inplace=True,ascending=True)
dfUseUrl
| Week/Year | url | count | |
|---|---|---|---|
| 0 | 2020-01-06 | ANSA.it | 1 |
| 14 | 2020-01-06 | youtu.be | 1 |
| 12 | 2020-01-06 | twitter.com | 14 |
| 11 | 2020-01-06 | torino.repubblica.it | 1 |
| 10 | 2020-01-06 | threader.app | 1 |
| ... | ... | ... | ... |
| 2774 | 2021-05-17 | ilrestodelcarlino.it | 2 |
| 2775 | 2021-05-17 | ilsimplicissimus2.com | 3 |
| 2776 | 2021-05-17 | ilsole24ore.com | 2 |
| 2760 | 2021-05-17 | genova24.it | 2 |
| 2861 | 2021-05-17 | vcomevittoria.it | 74 |
5723 rows × 3 columns
#All in the same graphic
fig = go.Figure()
for w in dfUrls.head(10).index:
mask = dfUseUrl['url'] == w
fig.add_trace(go.Scatter(x=dfUseUrl.loc[mask,'Week/Year'], y=dfUseUrl.loc[mask,'count'],
mode='lines+markers',
name=w))
fig.update_layout(title='All url history use',xaxis_title='Date',yaxis_title='use count')
fig.show()
# All in different graphic
for w in dfUrls.head(10).index:
fig = go.Figure()
mask = dfUseUrl['url'] == w
fig.add_trace(go.Scatter(x=dfUseUrl.loc[mask,'Week/Year'], y=dfUseUrl.loc[mask,'count'],
mode='lines+markers',
name=w))
fig.update_layout(title="History use of url '%s'"%w,xaxis_title='Date',yaxis_title='use count')
fig.show()
#Creating a map of all urls about youtube with the number of uses
listUrls = []
for s in df['urls']:
urls = gen_ut.get_string_json(s,'display_url')
for url in urls:
if "youtu.be" in url or "youtube.com" in url:
if url:
#url = url.split("//")
#url = url[0].split("/")
listUrls.append(url)
dfUrlsYt = pd.DataFrame()
dfUrlsYt['url'] = listUrls
dfUrlsYt['count'] = 1
dfUrlsYt = dfUrlsYt.groupby('url').sum()
dfUrlsYt.sort_values(['count'], axis = 0,inplace=True,ascending=False)
dfUrlsYt
| count | |
|---|---|
| url | |
| youtube.com/watch?v=U-kC9X… | 472 |
| youtube.com/watch?v=xgfQfD… | 346 |
| youtube.com/watch?v=-BbnIl… | 276 |
| youtube.com/watch?v=q04zOg… | 155 |
| youtube.com/watch?v=GzXbsu… | 141 |
| ... | ... |
| youtu.be/e1R7l28JJeo | 1 |
| youtu.be/eK9A3VjaBaU | 1 |
| youtu.be/eqvFlgydWVw | 1 |
| youtube.com/watch?v=R2ok5T… | 1 |
| m.youtube.com/watch?v=haxx_B… | 1 |
726 rows × 1 columns
n = 20
fig = px.histogram(dfUrlsYt.head(n),y=dfUrlsYt.head(n).index,x='count',
title="The most %d url about youtube used in the tweets"% n,orientation = 'h')
fig.update_yaxes(title='URL name')
fig.show()
from urllib.request import urlopen
'''
for i,c in dfUrlsYt.iterrows():
#if "watch" not in i:
print("http://"+i,"\t",c['count'])
urlopen("http://"+i).read()'''
'\nfor i,c in dfUrlsYt.iterrows():\n #if "watch" not in i:\n print("http://"+i,"\t",c[\'count\'])\n urlopen("http://"+i).read()'
#Creation of a dictionary of num of use per date
my_dict = {"url":[],"date":[], "count":[]};
for i in range(len(df)):
s = df.loc[i,'urls']
d = df.loc[i,'created_at']
urls = gen_ut.get_string_json(s,'display_url')
for url in urls:
if "youtu.be" in url or "youtube.com" in url:
if url:
#url = url.split("//")
#url = url[0].split("/")
listUrls.append(url)
my_dict["url"].append(url)
my_dict["date"].append(d)
my_dict["count"].append(1)
dfUseUrlYt = pd.DataFrame.from_dict(my_dict)
dfUseUrlYt['Week/Year'] = dfUseUrlYt['date'].apply(lambda x: "%d-%d" % (x.isocalendar()[1] , x.isocalendar()[0]))
dfUseUrlYt.drop(['date'], axis=1,inplace=True)
dfUseUrlYt = dfUseUrlYt.groupby(['Week/Year', 'url']).sum()
dfUseUrlYt.reset_index(inplace=True)
dfUseUrlYt['Week/Year'] =pd.to_datetime(dfUseUrlYt['Week/Year']+ '-1', format="%W-%Y-%w")
dfUseUrlYt.sort_values(['Week/Year'],axis = 0,inplace=True,ascending=True)
dfUseUrlYt
| Week/Year | url | count | |
|---|---|---|---|
| 0 | 2020-01-06 | youtu.be/WWcwInBEdGw | 1 |
| 549 | 2020-01-20 | youtu.be/JUKI1CEs7Lw | 1 |
| 550 | 2020-01-20 | youtu.be/JfDIie01QeE | 1 |
| 551 | 2020-01-20 | youtu.be/K9-PUInmA-I | 6 |
| 552 | 2020-01-20 | youtube.com/watch?v=UQpaNP… | 1 |
| ... | ... | ... | ... |
| 436 | 2021-05-17 | youtu.be/Fck6ZwNiq-w | 7 |
| 435 | 2021-05-17 | youtu.be/ERuI4arMu4c | 2 |
| 434 | 2021-05-17 | youtu.be/7k283DjMinw | 1 |
| 445 | 2021-05-17 | youtu.be/tdJoMIAZ334 | 1 |
| 440 | 2021-05-17 | youtu.be/_1KsDNQWrgU | 3 |
997 rows × 3 columns
#All in the same graphic
fig = go.Figure()
for w in dfUrlsYt.head().index:
mask = dfUseUrlYt['url'] == w
fig.add_trace(go.Scatter(x=dfUseUrlYt.loc[mask,'Week/Year'], y=dfUseUrlYt.loc[mask,'count'],
mode='lines+markers',
name=w))
fig.update_layout(title='All url history use',xaxis_title='Date',yaxis_title='use count')
fig.show()
# All in different graphic
for w in dfUrlsYt.head().index:
fig = go.Figure()
mask = dfUseUrlYt['url'] == w
fig.add_trace(go.Scatter(x=dfUseUrlYt.loc[mask,'Week/Year'], y=dfUseUrlYt.loc[mask,'count'],
mode='lines+markers',
name=w))
fig.update_layout(title="History use of url '%s'"%w,xaxis_title='Date',yaxis_title='use count')
fig.show()
mask = []
for url in df['urls']:
mask.append("youtu.be/16PvXtfdmX0" in url)
df[mask]
| id | created_at | user_screen_name | in_reply_to_screen_name | rt_user_screen_name | urls | |
|---|---|---|---|---|---|---|
| 312066 | 1251092982167678976 | 2020-04-17 10:19:45+00:00 | Samira1577 | NaN | MinervaMcGrani1 | [{'url': 'https://t.co/eAnhBta27L', 'expanded_... |
| 348633 | 1251065679660437504 | 2020-04-17 08:31:16+00:00 | gael99 | NaN | MinervaMcGrani1 | [{'url': 'https://t.co/eAnhBta27L', 'expanded_... |
| 368741 | 1251285545101201414 | 2020-04-17 23:04:56+00:00 | Lara_Eureka | NaN | MinervaMcGrani1 | [{'url': 'https://t.co/eAnhBta27L', 'expanded_... |
| 368766 | 1251280724264615937 | 2020-04-17 22:45:46+00:00 | chiara_m80 | NaN | MinervaMcGrani1 | [{'url': 'https://t.co/eAnhBta27L', 'expanded_... |
| 368825 | 1251267346485960705 | 2020-04-17 21:52:37+00:00 | Esticazzi1965 | NaN | MinervaMcGrani1 | [{'url': 'https://t.co/eAnhBta27L', 'expanded_... |
| ... | ... | ... | ... | ... | ... | ... |
| 373542 | 1288712339114602497 | 2020-07-30 05:45:39+00:00 | VittorioFerram1 | NaN | MinervaMcGrani1 | [{'url': 'https://t.co/eAnhBta27L', 'expanded_... |
| 390609 | 1251091137995161601 | 2020-04-17 10:12:26+00:00 | MinervaMcGrani1 | NaN | MinervaMcGrani1 | [{'url': 'https://t.co/eAnhBta27L', 'expanded_... |
| 390611 | 1251063946695639040 | 2020-04-17 08:24:23+00:00 | MinervaMcGrani1 | NaN | NaN | [{'url': 'https://t.co/eAnhBta27L', 'expanded_... |
| 419639 | 1251097730287575040 | 2020-04-17 10:38:37+00:00 | Z3r0Rules | NaN | MinervaMcGrani1 | [{'url': 'https://t.co/eAnhBta27L', 'expanded_... |
| 425808 | 1251238499245850626 | 2020-04-17 19:57:59+00:00 | xenonian1 | NaN | MinervaMcGrani1 | [{'url': 'https://t.co/eAnhBta27L', 'expanded_... |
115 rows × 6 columns
df = df_preparation(['hashtags','created_at'])
df['created_at'] = pd.to_datetime(df['created_at'], format="%a %b %d %X %z %Y")
#Creating a map of all hashtags with the number of uses
listHashtags = []
for s in df['hashtags']:
[ listHashtags.append(x) for x in gen_ut.get_string_json(s,'text') ]
dfHashtags = pd.DataFrame()
dfHashtags['hashtags'] = listHashtags
dfHashtags['count'] = 0
dfHashtags = dfHashtags.groupby('hashtags').count()
dfHashtags.sort_values(['count'],axis = 0,inplace=True,ascending=False)
dfHashtags
| count | |
|---|---|
| hashtags | |
| vaccino | 9830 |
| COVID19 | 9506 |
| Covid19 | 9393 |
| vaccini | 6177 |
| Pfizer | 4976 |
| ... | ... |
| SiamoTroppi | 1 |
| Sidney | 1 |
| SilvioBerlusconi | 1 |
| SilvioGarattini | 1 |
| 𝐕𝐀𝐂𝐂𝐈𝐍𝐈 | 1 |
4465 rows × 1 columns
dfUse = ut_ht.process_dfUse(df)
dfUse
| Week/Year | hashtag | count | |
|---|---|---|---|
| 0 | 2020-01-06 | AIFA | 3 |
| 8660 | 2020-01-06 | EmiliaRomagna | 1 |
| 8659 | 2020-01-06 | Burioni | 1 |
| 8658 | 2020-01-06 | Bonaccini | 1 |
| 8657 | 2020-01-06 | Bibbiano | 1 |
| ... | ... | ... | ... |
| 19005 | 2021-05-17 | Pfizer | 6 |
| 19004 | 2021-05-17 | Norimberga2 | 1 |
| 19003 | 2021-05-17 | NoObbligoVaccinale | 1 |
| 19017 | 2021-05-17 | VacciNazismo | 1 |
| 10910 | 2021-05-17 | virologi | 1 |
19520 rows × 3 columns
ut_ht.visual_histogram(dfHashtags,150)
hastagRemove = ['vaccin.*','covid.*','corona.*','astrazeneca','pfizer','sarscov2','sputnikv','moderna']
dfHashtagFiltered = dfHashtags
for r in hastagRemove:
mask = dfHashtagFiltered.index.str.lower().str.match(r) == True
dfHashtagFiltered.drop(dfHashtagFiltered[mask].index, inplace=True)
dfMoreFiltered = dfHashtagFiltered
hastagRemove = ['.*lombardia.*','draghi','conte','m5s','mattarella','salvini','speranza','renzi','lega','.*governo.*',
'.*moratti.*','zingaretti','scanzi','burioni','crisanti']
for r in hastagRemove:
mask = dfMoreFiltered.index.str.lower().str.match(r) == True
dfMoreFiltered.drop(dfMoreFiltered[mask].index, inplace=True)
ut_ht.visual_histogram(dfMoreFiltered,100)
ut_ht.visual_by_date_together(dfMoreFiltered,dfUse)
ut_ht.visual_by_date_split(dfMoreFiltered,dfUse)
df = df_preparation(['is_self_rt'])
dfSelf = df.groupby('user_screen_name').sum()
dfSelf['all_rt'] = df.groupby('user_screen_name').count().iloc[:,0]
dfSelf.sort_values(['all_rt'],inplace=True)
dfSelf
| id | is_self_rt | all_rt | |
|---|---|---|---|
| user_screen_name | |||
| SpaakMr | 1.378293e+18 | 0 | 1 |
| Samsung9900 | 1.389977e+18 | 0 | 1 |
| Samuel21091422 | 1.373232e+18 | 0 | 1 |
| remodalfonso | 1.293633e+18 | 0 | 1 |
| SanMichele33 | 1.353708e+18 | 0 | 1 |
| ... | ... | ... | ... |
| MarySpes | 6.301979e+21 | 0 | 4648 |
| xenonian1 | 6.688016e+21 | 24 | 5017 |
| Z3r0Rules | 7.216171e+21 | 392 | 5347 |
| Pietro_Otto | 7.410851e+21 | 0 | 5412 |
| Piero42395724 | 7.962673e+21 | 0 | 5835 |
24743 rows × 3 columns
n = 20
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Bar(y=dfSelf.tail(n).index, x=dfSelf.tail(n)['all_rt'],orientation='h', name = 'All retweet'), row=1, col=1)
fig.add_trace(go.Bar(y=dfSelf.tail(n).index, x=dfSelf.tail(n)['is_self_rt'],orientation='h', name = 'Self retweet'), row=1, col=1)
fig.update_layout(title="How many retweet are self retweet (the most 20 retweeter)")
fig.update_xaxes(title="Count of retweets")
fig.update_yaxes(title="Username")
fig.update_layout(
barmode="overlay",
bargap=0.1)
fig.show()
df = df_preparation(['is_self_rt'])
df
| id | user_screen_name | in_reply_to_screen_name | rt_user_screen_name | is_self_rt | |
|---|---|---|---|---|---|
| 0 | 1315035428098015237 | KurtRoll3 | borghi_claudio | NaN | False |
| 1 | 1314915759932534788 | Giovanni1946861 | borghi_claudio | NaN | False |
| 2 | 1314824033154871296 | Franco_Barile1 | borghi_claudio | NaN | False |
| 3 | 1315421329072545793 | AnnamariaCorra7 | borghi_claudio | NaN | False |
| 4 | 1315366608202805248 | gaetanovetri1 | borghi_claudio | NaN | False |
| ... | ... | ... | ... | ... | ... |
| 442502 | 1394545421578604548 | IacobellisT | NaN | NaN | False |
| 442503 | 1394545570417725446 | IacobellisT | NaN | rubino7004 | False |
| 442504 | 1394545509868707841 | IacobellisT | NaN | GiuseppePalma78 | False |
| 442505 | 1394545510346895363 | IacobellisT | NaN | byoblu | False |
| 442506 | 1394545697526067201 | IacobellisT | NaN | valy_s | False |
442507 rows × 5 columns
dfRetweet = df.dropna(subset=['rt_user_screen_name']).copy()
dfRetweet.drop(columns=['in_reply_to_screen_name'],inplace=True,errors='ignore')
dfRetweet = dfRetweet.groupby('rt_user_screen_name').count()
dfRetweet.rename(columns={'user_screen_name':'all_rt'},inplace=True,errors='ignore')
dfRetweet['self_rt'] = df.dropna(subset=['rt_user_screen_name'
]).copy().groupby('rt_user_screen_name').sum().loc[:,'is_self_rt']
dfRetweet.drop(columns=['is_self_rt'],inplace=True,errors='ignore')
dfRetweet['real_rt'] = dfRetweet['all_rt'] - dfRetweet['self_rt']
dfRetweet.sort_values('real_rt',ascending=False,inplace=True)
dfRetweet
| id | all_rt | self_rt | real_rt | |
|---|---|---|---|---|
| rt_user_screen_name | ||||
| MinervaMcGrani1 | 28977 | 28977 | 0 | 28977 |
| BarbaraRaval | 21752 | 21752 | 70 | 21682 |
| valy_s | 21626 | 21626 | 2 | 21624 |
| noitre32 | 18133 | 18133 | 0 | 18133 |
| intuslegens | 13617 | 13617 | 0 | 13617 |
| ... | ... | ... | ... | ... |
| Salvato00506710 | 1 | 1 | 0 | 1 |
| Salviniunico | 1 | 1 | 0 | 1 |
| SalvoInUK | 1 | 1 | 0 | 1 |
| SamaRosa70 | 1 | 1 | 0 | 1 |
| zuhura05 | 1 | 1 | 0 | 1 |
5635 rows × 4 columns
n = 20
fig =px.histogram(dfRetweet.head(n),y=dfRetweet.head(n).index,x='real_rt',orientation='h')
fig.update_yaxes(title='username')
fig.update_layout(title="The most %d users retweeted (without self retweet)"%n)
fig.show()
sum(dfRetweet['real_rt'])
380325
dfReply = df.dropna(subset=['in_reply_to_screen_name']).copy()
dfReply.drop(columns=['rt_user_screen_name'],inplace=True,errors='ignore')
dfReply.drop(columns=['is_self_rt'],inplace=True,errors='ignore')
dfReply = dfReply.groupby('in_reply_to_screen_name').count()
dfReply.rename(columns={'user_screen_name':'count'},inplace=True,errors='ignore')
dfReply.sort_values('count',ascending=False,inplace=True)
dfReply
| id | count | |
|---|---|---|
| in_reply_to_screen_name | ||
| borghi_claudio | 4564 | 4564 |
| valy_s | 2973 | 2973 |
| MinervaMcGrani1 | 1887 | 1887 |
| cris_cersei | 1758 | 1758 |
| ladyonorato | 1628 | 1628 |
| ... | ... | ... |
| NotKayloss | 1 | 1 |
| NostraD64669142 | 1 | 1 |
| NormalitaNuova | 1 | 1 |
| Norbertus1860 | 1 | 1 |
| zucconideputato | 1 | 1 |
2784 rows × 2 columns
n = 20
fig =px.histogram(dfReply.head(n),y=dfReply.head(n).index,x='count',orientation='h')
fig.update_yaxes(title='username')
fig.update_layout(title="The most %d users replied"%n)
fig.show()
df = df_preparation(['is_self_rt'])
df1 = df.copy()
df1['sum_total_posts'] = 1
df1 = df1.groupby('user_screen_name').sum()
df = df.groupby('user_screen_name').count()
df['sum_total_posts'] = df1['sum_total_posts']
df['sum_self_rt'] = df1['is_self_rt']
del df1
df.rename(columns={'in_reply_to_screen_name':'num_reply','rt_user_screen_name':'num_rt'},inplace=True,errors='ignore')
df['num_reply'] = dfReply
df.loc[df['num_reply'].isna(),'num_reply'] = 0
df['num_rt'] = dfRetweet['real_rt']
df.loc[df['num_rt'].isna(),'num_rt'] = 0
df
| id | num_reply | num_rt | is_self_rt | sum_total_posts | sum_self_rt | |
|---|---|---|---|---|---|---|
| user_screen_name | ||||||
| 00000o0OOOO00 | 15 | 0.0 | 1.0 | 15 | 15.0 | 0 |
| 00650301 | 1 | 0.0 | 0.0 | 1 | 1.0 | 0 |
| 00_Maura | 1 | 0.0 | 0.0 | 1 | 1.0 | 0 |
| 0109Taiyo | 1 | 0.0 | 0.0 | 1 | 1.0 | 0 |
| 01nikks | 3 | 0.0 | 0.0 | 3 | 3.0 | 0 |
| ... | ... | ... | ... | ... | ... | ... |
| zucche_rosa | 1 | 0.0 | 0.0 | 1 | 1.0 | 0 |
| zucchi1911 | 1 | 0.0 | 0.0 | 1 | 1.0 | 0 |
| zucco52 | 2 | 0.0 | 0.0 | 2 | 2.0 | 0 |
| zxuz53 | 3 | 0.0 | 0.0 | 3 | 3.0 | 0 |
| zziocane66 | 116 | 0.0 | 0.0 | 116 | 116.0 | 0 |
24743 rows × 6 columns
df.describe()
| id | num_reply | num_rt | is_self_rt | sum_total_posts | sum_self_rt | |
|---|---|---|---|---|---|---|
| count | 24743.000000 | 24743.00000 | 24743.000000 | 24743.000000 | 24743.000000 | 24743.000000 |
| mean | 17.884129 | 1.57196 | 15.070606 | 17.884129 | 17.884129 | 0.055208 |
| std | 140.411696 | 48.21337 | 410.179670 | 140.411696 | 140.411696 | 3.410107 |
| min | 1.000000 | 0.00000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 |
| 25% | 1.000000 | 0.00000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 |
| 50% | 2.000000 | 0.00000 | 0.000000 | 2.000000 | 2.000000 | 0.000000 |
| 75% | 7.000000 | 0.00000 | 0.000000 | 7.000000 | 7.000000 | 0.000000 |
| max | 5835.000000 | 4564.00000 | 28977.000000 | 5835.000000 | 5835.000000 | 392.000000 |
df.sort_values('num_rt',ascending=False,inplace=True)
df = df.head(10)
df
| id | num_reply | num_rt | is_self_rt | sum_total_posts | sum_self_rt | |
|---|---|---|---|---|---|---|
| user_screen_name | ||||||
| MinervaMcGrani1 | 4176 | 1887.0 | 28977.0 | 4176 | 4176.0 | 0 |
| BarbaraRaval | 1371 | 841.0 | 21682.0 | 1371 | 1371.0 | 70 |
| valy_s | 3064 | 2973.0 | 21624.0 | 3064 | 3064.0 | 2 |
| noitre32 | 2045 | 1290.0 | 18133.0 | 2045 | 2045.0 | 0 |
| intuslegens | 60 | 450.0 | 13617.0 | 60 | 60.0 | 0 |
| borghi_claudio | 392 | 4564.0 | 13222.0 | 392 | 392.0 | 162 |
| pbecchi | 176 | 1102.0 | 12736.0 | 176 | 176.0 | 16 |
| miia_2018 | 1075 | 395.0 | 12375.0 | 1075 | 1075.0 | 0 |
| cris_cersei | 852 | 1758.0 | 11526.0 | 852 | 852.0 | 8 |
| ladyonorato | 507 | 1628.0 | 11410.0 | 507 | 507.0 | 4 |
n = 20
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Bar(y=df.head(n).index, x=df.head(n)['num_reply'],orientation='h', name = 'Reply'), row=1, col=1)
fig.add_trace(go.Bar(y=df.head(n).index, x=df.head(n)['num_rt']-df.head(n)['sum_self_rt'],orientation='h', name = 'Retweet'), row=1, col=1)
fig.add_trace(go.Bar(y=df.head(n).index, x=df.head(n)['sum_self_rt'],orientation='h', name = 'Self retweet'), row=1, col=1)
fig.update_layout(title="How many time this users are retweeted")
fig.update_xaxes(title="Count")
fig.update_yaxes(title="Username")
fig.show()
for i in range(0,len(df),2):
fig = make_subplots(rows=1,cols=2,specs=[[{'type':'domain'}, {'type':'domain'}]])
names = ['Reply', 'Retweet','Tweet','Self retweet']
u = df.index[i]
if u in listSuspect:
values = [df.loc[u,'num_reply'], df.loc[u,'num_rt'] - df.loc[u,'sum_self_rt'],
df.loc[u,'sum_total_posts']-(df.loc[u,'num_reply']+df.loc[u,'num_rt']),
df.loc[u,'sum_self_rt']]
fig.add_trace(go.Pie(labels = names, values = values,title="%s"%u,textposition='inside')
,row=1,col=1)
u = df.index[i+1]
values = [df.loc[u,'num_reply'], df.loc[u,'num_rt'] - df.loc[u,'sum_self_rt'],
df.loc[u,'sum_total_posts']-(df.loc[u,'num_reply']+df.loc[u,'num_rt']),
df.loc[u,'sum_self_rt']]
fig.add_trace(go.Pie(labels = names, values = values,title="%s"%u,textposition='inside')
,row=1,col=2)
fig.update_layout(title="Post type division")
fig.update_traces(marker=dict(colors=['red', 'blue', 'green','cyan']))
fig.show()